{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 特征提取\n",
    "import numpy as np\n",
    "from sklearn import preprocessing\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  1.,  0.],\n",
       "       [ 0.,  0.,  1.],\n",
       "       [ 1.,  0.,  0.]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "onehot_encoder = DictVectorizer()\n",
    "instances = [{'city': '北京'},{'city': '天津'}, {'city': '上海'}]\n",
    "onehot_encoder.fit_transform(instances).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1 1 0 1 0 1 0 1]\n",
      " [1 1 1 0 1 0 1 0]]\n",
      "{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}\n"
     ]
    }
   ],
   "source": [
    "corpus = ['UNC played Duke in basketball',\n",
    "          'Duke lost the basketball game' ]\n",
    "vectorizer = CountVectorizer()\n",
    "print(vectorizer.fit_transform(corpus).todense())\n",
    "print(vectorizer.vocabulary_) # 单词对应数组的位置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.75458397  0.37729199  0.53689271  0.          0.        ]\n",
      " [ 0.          0.          0.44943642  0.6316672   0.6316672 ]]\n",
      "{'dog': 1, 'ate': 0, 'sandwich': 2, 'wizard': 4, 'transfigured': 3}\n"
     ]
    }
   ],
   "source": [
    "corpus = ['The dog ate a sandwich and I ate a sandwich',\n",
    "          'The wizard transfigured a sandwich' ]\n",
    "vectorizer = TfidfVectorizer(stop_words='english')\n",
    "print(vectorizer.fit_transform(corpus).todense()) # 带权重信息\n",
    "print(vectorizer.vocabulary_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.        , -0.70710678, -1.38873015,  0.52489066,  0.59299945,\n",
       "        -1.35873244],\n",
       "       [ 0.        , -0.70710678,  0.46291005,  0.87481777,  0.81537425,\n",
       "         1.01904933],\n",
       "       [ 0.        ,  1.41421356,  0.9258201 , -1.39970842, -1.4083737 ,\n",
       "         0.33968311]])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = np.array([[0, 0, 5, 13, 9, 1],\n",
    "              [0, 0, 13, 15, 10, 15],\n",
    "              [0, 3, 15, 2, 0, 11]],\n",
    "             dtype='float64')\n",
    "x_scaled = preprocessing.scale(x) # scale是对每一列数据进行的\n",
    "x_scaled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,\n",
       "         0.00000000e+00,   0.00000000e+00,   3.70074342e-17])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_scaled.mean(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.,  1.,  1.,  1.,  1.,  1.])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_scaled.std(axis=0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
